This should be done whenever you start a new r session. For this script we add several more functions that are useful for data processing, quality control, and figure plotting.
library(minfi)
library(MASS)
library(abind)
library(sva)
library(Hmisc)
library(ggplot2)
# Setting file paths for data, output, scripts
# The rest of this script assumes that your data are in a folder called "project" on the Cloud.
# It also assumes that your ouptut will be stored in a subfolder called "output" on the Cloud.
# As you work on your own computer, you will need to specify the folder locations.
# Folder location of the data files
data_dir <- "/cloud/project/Data"
data_dir
## [1] "/cloud/project/Data"
# Folder location to put the output files
output_dir <- "/cloud/project/output/"
output_dir
## [1] "/cloud/project/output/"
There are various quality control steps that need to be executed before we can say that the data is clean
load(file.path(data_dir, "RGset.rda"))
pd <- pData(RGset)
Here we extract the signal intensity from each sample. Low signal intensity is sign of a poor sample. Additionally, we can stratify signal intensity by variables such as batch to get an idea about the technical noise in our sample.
# MethylSet (Mset) contains metylated and unmethylated signals made using preprocessRaw()
rawMSet <- preprocessRaw(RGset)
## Loading required package: IlluminaHumanMethylation450kmanifest
rawMSet
## class: MethylSet
## dim: 485512 17
## metadata(0):
## assays(2): Meth Unmeth
## rownames(485512): cg00050873 cg00212031 ... ch.22.47579720R
## ch.22.48274842R
## rowData names(0):
## colnames(17): GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## ... GSM1052032_5730053011_R06C01 GSM1052037_5730053011_R06C02
## colData names(11): GEOID celltype ... Batch filenames
## Annotation
## array: IlluminaHumanMethylation450k
## annotation: ilmn12.hg19
## Preprocessing
## Method: Raw (no normalization or bg correction)
## minfi version: 1.36.0
## Manifest version: 0.4.0
# save(rawMSet, file = "rawMSet.rda")
# M signal per probe, per sample
Meth <- getMeth(rawMSet)
Meth[1:5, 1:5]
## GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873 514 270
## cg00212031 170 400
## cg00213748 312 490
## cg00214611 181 349
## cg00455876 295 497
## GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873 26002 1073
## cg00212031 342 242
## cg00213748 1997 237
## cg00214611 312 349
## cg00455876 5458 681
## GSM1051871_7766130158_R05C02
## cg00050873 396
## cg00212031 420
## cg00213748 286
## cg00214611 262
## cg00455876 470
# U signal per probe, per sample
Unmeth <- getUnmeth(rawMSet)
Unmeth[1:5, 1:5]
## GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873 432 348
## cg00212031 494 463
## cg00213748 299 476
## cg00214611 362 459
## cg00455876 1183 922
## GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873 5595 571
## cg00212031 8375 272
## cg00213748 688 358
## cg00214611 5644 269
## cg00455876 3139 1248
## GSM1051871_7766130158_R05C02
## cg00050873 355
## cg00212031 478
## cg00213748 301
## cg00214611 576
## cg00455876 1312
# Set up color palettes for plotting
myColors <- c("dodgerblue", "firebrick1", "seagreen3")
graphColors <- c(
"#023FA5", "#7D87B9", "#BEC1D4", "#D6BCC0", "#BB7784", "#D33F6A", "#11C638", "#8DD593", "#C6DEC7",
"#EAD3C6", "#F0B98D", "#EF9708", "#0FCFC0", "#9CDED6", "#D5EAE7", "#F3E1EB", "#F6C4E1", "#F79CD4",
"#4A6FE3", "#8595E1", "#B5BBE3", "#E6AFB9", "#E07B91"
)
# Overall intensity: M vs. U
pd$MQC <- log2(colMedians(Meth))
pd$UQC <- log2(colMedians(Unmeth))
pd$Array <- factor(pd$Array)
palette(graphColors)
plot(pd$UQC, pd$MQC, main = "M vs. U QC", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
plot(pd$UQC, pd$MQC, col = as.factor(pd$Slide), main = "M vs. U QC by Slide", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", levels(as.factor(pd$Slide)), fill = graphColors)
plot(pd$UQC, pd$MQC, col = pd$Array, main = "M vs. U QC by Position", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", levels(pd$Array), fill = graphColors)
palette(myColors)
plot(pd$UQC, pd$MQC, col = pd$Batch, main = "M vs. U QC by Batch", pch = 16, xlab = "Log2 Median Unmethylated Intensity", ylab = "Log2 Median Methylated Intensity", cex.lab = 1.2, cex.main = 2)
legend("bottomright", c("Batch 1", "Batch 2"), fill = myColors)
rm(Meth, Unmeth)
Here we have no low intensity samples. Also note that the cutpoint of 11 for UQC and MQC is not set in stone as that value; it may depend on your data.
# Drop (or if really small sample: watch out for): Samples with UQC<11 & MQC<11
# Note the cutoff value (here, 11) would depend on your data and array (EPIC/450k)
which(pd$UQC < 11)
## integer(0)
length(which(pd$UQC < 11))
## [1] 0
which(pd$MQC < 11)
## integer(0)
length(which(pd$MQC < 11))
## [1] 0
We can check conflicts between the reported and predicted sex of each sample. Predicted sex is derived from looking at the signal intensities of the sex chromosomes. This can help us to identify contaminated sample and poorly assayed samples.
GmRawSet <- mapToGenome(rawMSet)
## Loading required package: IlluminaHumanMethylation450kanno.ilmn12.hg19
sex <- getSex(GmRawSet)
colData(GmRawSet) <- sex
# save(sex, file=file.path(data_dir, "Estimate-Sex.rda"))
pd <- merge(pd, as.matrix(sex), by = "row.names", sort = FALSE)
rownames(pd) <- pd$Basename
pd <- pd[, -1]
table(pd$predictedSex, pd$gender)
##
## F M
## F 10 0
## M 0 7
plotSex(GmRawSet)
rm(GmRawSet, sex)
# Raw density plot
beta.raw <- getBeta(rawMSet)
mvalue.raw <- getM(rawMSet)
type <- getProbeType(rawMSet)
probe.type <- data.frame(Name = rownames(beta.raw), Type = type)
densityPlot(beta.raw, sampGroups = pd$casestatus, main = "Raw Beta by Case Status")
densityPlot(beta.raw, sampGroups = pd$Batch, main = "Raw Beta by Batch")
plotBetasByType(beta.raw[, 1], probeTypes = probe.type, main = "Raw Beta by Probe Type, Sample 1")
densityPlot(mvalue.raw, sampGroups = pd$Batch, main = "Raw M-value by Batch", xlab = "M-value")
Principal components show us main driving variables behind variation in the DNA methylation data
beta.raw2 <- beta.raw
beta.raw2[is.na(beta.raw2)] <- 0
rm(rawMSet, beta.raw, mvalue.raw, type, probe.type)
# makes the principal component object
prin <- prcomp(t(beta.raw2), center = T, scale. = F)
# save(prin, file=file.path(data_dir, "princomp.rda"))
# pulls out proportion of variance explained by each PC
out.var <- prin$sdev^2 / sum(prin$sdev^2) # the percent variance at each PC
out.var[1:10]
## [1] 0.32829751 0.14178446 0.09810244 0.06415463 0.04391130 0.03810846
## [7] 0.03432127 0.03244897 0.03101745 0.03058116
sum(out.var)
## [1] 1
# Raw PCA plots
screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Raw Beta Values", main = "", cex.lab = 1.3)
plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$gender)), fill = myColors, title = "Principal components by sex")
pairs(prin$x[, 1:6], col = factor(pd$gender), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)
plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Batch)), fill = myColors, title = "Principal components by batch")
pairs(prin$x[, 1:6], col = factor(pd$Batch), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)
plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Array)), fill = graphColors, title = "Principal components by array")
pairs(prin$x[, 1:6], col = factor(pd$Array), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)
plot.new()
palette(myColors)
legend("bottom", legend = levels(factor(pd$Slide)), fill = myColors, title = "Principal components by slide")
pairs(prin$x[, 1:6], col = factor(pd$Slide), labels = paste0("PC", 1:6), pch = 1, cex = 0.5)
rm(phenos, prin, beta.raw2, out.var)
DetectionP gives us a metric for assessing signal/noise ratios. Both samples and probes with higher detection p levels (indicative of unacceptable levels of noise) are cut or flagged.
detP <- detectionP(RGset)
dim(detP)
## [1] 485512 17
detP[1:5, 1:5]
## GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg00050873 1.028592e-02 0.960841448
## cg00212031 2.324817e-01 0.640617366
## cg00213748 3.325807e-01 0.409574003
## cg00214611 4.798274e-01 0.750000321
## cg00455876 5.603033e-08 0.002416194
## GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg00050873 0.000000e+00 2.829189e-14
## cg00212031 0.000000e+00 6.031609e-01
## cg00213748 7.477842e-18 3.836341e-01
## cg00214611 9.323309e-150 3.248249e-01
## cg00455876 0.000000e+00 1.304945e-21
## GSM1051871_7766130158_R05C02
## cg00050873 2.420704e-01
## cg00212031 7.499246e-02
## cg00213748 5.500699e-01
## cg00214611 1.276492e-01
## cg00455876 1.940226e-09
# save(detP,file=file.path(data_dir, "detection-P.rda"))
failed <- detP > 0.01
per.samp <- colMeans(failed) # Fraction of failed positions per sample (length: 42)
summary(per.samp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0002245 0.0006900 0.0009124 0.0028833 0.0026343 0.0183827
per.probe <- rowMeans(failed) # Fraction of failed samples per position (length: 485512)
summary(per.probe)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.002883 0.000000 0.882353
sum(per.samp > 0.01) # How many samples had more than 1% of sites fail?
## [1] 2
sum(per.probe > 0.1) # How many positions failed in >10% of samples?
## [1] 5049
# Identify samples that are failed
probe.fail <- failed[per.probe > 0.1, ]
length(probe.fail)
## [1] 85833
sample.fail <- per.samp[per.samp > 0.01]
sample.fail
## GSM1052032_5730053011_R06C01 GSM1052037_5730053011_R06C02
## 0.01151980 0.01838266
hist(per.samp, breaks = 40, col = "dodgerblue", cex.lab = 1.3, xlab = "Fraction of failed positions per sample")
abline(v = 0.01, col = "red")
hist(per.probe, breaks = 40, col = "dodgerblue", cex.lab = 1.3, xlab = "Fraction of failed samples per probe", ylim = c(0, 500))
abline(v = 0.1, col = "red", lwd = 2)
RGset.drop <- RGset[, !colnames(RGset) %in% names(sample.fail)] # Drop samples that failed by detection p.
dim(RGset.drop)
## [1] 622399 15
rm(RGset, detP, failed, per.samp, per.probe)
Cross-reactive probes are shown to ‘co-hybridize’ onto sex chromosomes and may show spurious sex differences in methylation that are merely due to technical artifacts
load(file.path(data_dir, "cross.probes.info.rda"))
dim(cross.probes.info)
## [1] 29233 5
head(cross.probes.info)
## TargetID 47 48 49 50
## 1 cg00001510 2 0 1 0
## 2 cg00003969 0 0 2 0
## 3 cg00004121 0 4 1 0
## 4 cg00004192 0 1 0 0
## 5 cg00004209 0 2 1 0
## 6 cg00004771 3 0 0 0
# rec is to use the 47 cross probe cutoff
Noob uses control probes to correct for background flouresence
# Noob
noob <- preprocessNoob(RGset.drop, offset = 15, dyeCorr = TRUE, verbose = TRUE)
## [dyeCorrection] Applying R/G ratio flip to fix dye bias
noob.dropP <- noob[!rownames(noob) %in% rownames(probe.fail), ]
noob.dropCross <- noob.dropP[!rownames(noob.dropP) %in% cross.probes.info$TargetID, ]
noob <- noob.dropCross
save(noob, file = file.path(data_dir, "noob.rda"))
rm(noob.dropP, noob.dropCross, cross.probes.info)
beta.n <- getBeta(noob)
pd <- pData(noob)
rm(noob)
Gap probes are probes for which the methylation beta clusters into discrete groups– these typically have their methylation driven by underlying SNPs
gaps <- gaphunter(beta.n)
## [gaphunter] Using 451,445 probes and 15 samples.
## [gaphunter] Searching for gap signals.
## [gaphunter] Found 80,470 gap signals.
## [gaphunter] Filtering out gap signals driven by outliers.
## [gaphunter] Removed 0 gap signals driven by outliers from results.
str(gaps)
## List of 3
## $ proberesults :'data.frame': 80470 obs. of 10 variables:
## ..$ Groups: num [1:80470] 2 2 2 4 8 3 2 2 2 2 ...
## ..$ Group1: num [1:80470] 6 6 7 1 4 1 5 6 6 6 ...
## ..$ Group2: num [1:80470] 9 9 8 1 2 12 10 9 9 9 ...
## ..$ Group3: num [1:80470] 0 0 0 7 3 2 0 0 0 0 ...
## ..$ Group4: num [1:80470] 0 0 0 6 1 0 0 0 0 0 ...
## ..$ Group5: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
## ..$ Group6: num [1:80470] 0 0 0 0 2 0 0 0 0 0 ...
## ..$ Group7: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
## ..$ Group8: num [1:80470] 0 0 0 0 1 0 0 0 0 0 ...
## ..$ Group9: num [1:80470] 0 0 0 0 0 0 0 0 0 0 ...
## $ sampleresults: num [1:80470, 1:15] 2 2 2 3 1 2 2 2 2 2 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:80470] "cg01707559" "cg03244189" "cg04689676" "cg04964672" ...
## .. ..$ : NULL
## $ algorithm :List of 3
## ..$ threshold : num 0.05
## ..$ outCutoff : num 0.01
## ..$ keepOutliers: logi FALSE
#Get a probe in the list as an example and plot the beta distribution.
gap1 <- beta.n[rownames(gaps$sampleresults)[42],]
gap1 <- data.frame(gap1)
ggplot(gap1, aes(x = rownames(gaps$sampleresults)[42], y = gap1)) +
geom_jitter(width = 0.01) +
labs(title = sprintf("Methylation Beta Values for probe %s", rownames(gaps$sampleresults)[42]), x = " ", y = "beta") +
coord_cartesian(ylim = c(0, 1)) +
theme_bw()
Cell type proportions are a strong confounder of DNA methylation. Different cell types have varying levels of methylation in different probes so we can use these probes to disentangle cell types proportions from unrelated DNA methylation differences.
# Cell type
# library(FlowSorted.Blood.450k)
# cell <- estimateCellCounts(RGset.drop)
# save(cell, file=file.path(data_dir,"cell-type-estimates.rda"))
# cell estimation may take more memory than available in RStudio Cloud, if an issue load premade cell type estimates
load(file.path(data_dir, "Premade_Intermediate_Files/cell-type-estimates.rda"))
dim(cell)
## [1] 15 6
head(cell)
## CD8T CD4T NK Bcell
## GSM1051870_7766130158_R03C02 0.04188973 0.084594790 0.05023918 0.04773192
## GSM1052024_5730053010_R01C02 0.11315767 0.125408728 0.03213701 0.05810510
## GSM1052035_5730053011_R04C02 0.17453771 0.175923277 0.03048209 0.10661716
## GSM1051874_7766130166_R02C01 0.08986933 0.165085488 0.02269505 0.04247854
## GSM1051871_7766130158_R05C02 0.04901293 0.070227977 0.04387917 0.03731421
## GSM1051872_7766130158_R06C02 0.02868823 0.006123224 0.01649349 0.02748160
## Mono Gran
## GSM1051870_7766130158_R03C02 0.07119778 0.7225386
## GSM1052024_5730053010_R01C02 0.09360675 0.6262584
## GSM1052035_5730053011_R04C02 0.06070866 0.5079184
## GSM1051874_7766130166_R02C01 0.09390137 0.5949493
## GSM1051871_7766130158_R05C02 0.07663316 0.7413125
## GSM1051872_7766130158_R06C02 0.03591401 0.9019293
identical(rownames(cell), colnames(beta.n)) # A quick sanity check to make sure our samples are in the right order.
## [1] TRUE
pd.n <- data.frame(pd, cell)
rm(RGset.drop)
prin.cell <- prcomp(t(cell), center = T, scale. = F)
out.var <- prin.cell$sdev^2 / sum(prin.cell$sdev^2)
out.var
## [1] 9.834555e-01 1.149609e-02 3.112862e-03 1.457190e-03 4.783495e-04
## [6] 3.776973e-33
screeplot(prin.cell, col = "dodgerblue", xlab = "Principal Components of Estimated Cell Type", main = "", cex.lab = 1.3)
myColors <- c("seagreen3", "dodgerblue", "darkorchid", "firebrick1", "darkorange", "khaki1")
palette(myColors)
par(mar = c(0, 0, 0, 0))
plot.new()
legend("bottom", c("Bcell", "CD4T", "CD8T", "Gran", "Mono", "NK"), fill = myColors, title = "Principal Components by Estimated Cell Type")
pairs(prin.cell$x[, 1:6], col = as.factor(rownames(prin.cell$x)), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 15)
summary(pd.n$Gran)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5079 0.6106 0.7225 0.6965 0.7389 0.9019
hist(pd.n$Gran, breaks = 20, col = "dodgerblue", xlab = "Estimated percent granulocytes", cex.lab = 1.3)
hist(pd.n$Mono, breaks = 20, col = "dodgerblue", xlab = "Estimated percent monocytes", cex.lab = 1.3)
hist(pd.n$NK, breaks = 20, col = "dodgerblue", xlab = "Estimated percent NK cells", cex.lab = 1.3)
hist(pd.n$CD8T, breaks = 20, col = "dodgerblue", xlab = "Estimated percent CD8T", cex.lab = 1.3)
hist(pd.n$CD4T, breaks = 20, col = "dodgerblue", xlab = "Estimated percent CD4T", cex.lab = 1.3)
# Pick cutoffs for biological ranges for cell type estimates.
pd.n.drop <- pd.n[pd.n$Mono < 0.2, ]
beta.n <- beta.n[, colnames(beta.n) %in% rownames(pd.n.drop)]
dim(beta.n)
## [1] 451445 15
# Noob PCA plots
pd.n.drop$gran.quart <- cut2(pd.n.drop$Gran, g = 4)
prin <- prcomp(t(beta.n), center = T, scale. = F)
out.var <- prin$sdev^2 / sum(prin$sdev^2)
out.var[1:10]
## [1] 0.27005828 0.18834184 0.09414191 0.06102141 0.04786377 0.04435116
## [7] 0.04268365 0.04147590 0.03885570 0.03711819
screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Noob Beta Values", main = "", cex.lab = 1.3)
plot.new()
palette(myColors)
par(mar = c(0, 0, 0, 0))
legend("bottom", levels(as.factor(pd.n.drop$gender)), fill = myColors, title = "Principal Components by Sex")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$gender), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.n.drop$Batch)), fill = myColors, title = "Principal Components by Batch")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Batch), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.n.drop$casestatus)), fill = myColors, title = "Principal Components by Case Status")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$casestatus), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
palette(graphColors)
legend("bottom", levels(as.factor(pd.n.drop$Array)), fill = graphColors, title = "Principal Components by Array Position")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Array), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.n.drop$Slide)), fill = graphColors, title = "Principal Components by Slide")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$Slide), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.n.drop$gran.quart)), fill = graphColors, title = "Principal Components by Quartile of Gran")
pairs(prin$x[, 1:6], col = as.factor(pd.n.drop$gran.quart), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
# Drop samples with missing data at key covariates
table(pd.n.drop$gender, useNA = "always")
##
## F M <NA>
## 9 6 0
table(pd.n.drop$casestatus, useNA = "always")
##
## Control RA <NA>
## 5 10 0
table(pd.n.drop$smoking, useNA = "always")
##
## current ex never occasional <NA>
## 4 5 4 2 0
table(pd.n.drop$Batch, useNA = "always")
##
## 1 2 <NA>
## 5 10 0
# No samples missing data, we're okay. But run this line anyway.
pd.complete <- pd.n.drop[!(is.na(pd.n.drop$smoking)), ] # Example of how to remove.
save(pd.complete, file = file.path(data_dir, "pd-complete.rda"))
beta.noob.complete <- beta.n[, colnames(beta.n) %in% rownames(pd.complete)]
rm(beta.n)
mod <- model.matrix(~ pd.complete$gender + pd.complete$casestatus + pd.complete$smoking)
# combat.beta <- ComBat(dat = beta.noob.complete, batch = pd.complete$Batch, mod = mod)
# if ComBat requires more memory than available, load premade results:
load(file.path(data_dir, "Premade_Intermediate_Files/combat-beta.rda"))
combat.beta[1:5, 1:5]
## GSM1051870_7766130158_R03C02 GSM1052024_5730053010_R01C02
## cg01707559 0.1597848 0.1462628
## cg03244189 0.1943061 0.1996871
## cg04689676 0.1552875 0.1546703
## cg04964672 0.6965052 0.6376027
## cg13851368 0.2522998 0.2511689
## GSM1052035_5730053011_R04C02 GSM1051874_7766130166_R02C01
## cg01707559 0.05731516 0.2056190
## cg03244189 0.08506135 0.2456049
## cg04689676 0.03855749 0.1420821
## cg04964672 0.88477145 0.5154691
## cg13851368 0.80565715 0.3034519
## GSM1051871_7766130158_R05C02
## cg01707559 0.2304606
## cg03244189 0.2356330
## cg04689676 0.1885954
## cg04964672 0.6739231
## cg13851368 0.2614161
# save the cleaned matrix.
save(combat.beta, file = file.path(data_dir, "combat-beta.rda"))
prin <- prcomp(t(combat.beta), center = T, scale. = F)
out.var <- prin$sdev^2 / sum(prin$sdev^2)
out.var[1:10]
## [1] 0.24201673 0.18800841 0.09044395 0.07922183 0.06261984 0.05290151
## [7] 0.04745881 0.04666915 0.04532922 0.04157885
screeplot(prin, col = "dodgerblue", xlab = "Principal Components of Noob Beta Values", main = "", cex.lab = 1.3)
plot.new()
palette(myColors)
par(mar = c(0, 0, 0, 0))
legend("bottom", levels(as.factor(pd.complete$gender)), fill = myColors, title = "Principal Components by Sex")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$gender), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.complete$Batch)), fill = myColors, title = "Principal Components by Batch")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$Batch), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.complete$casestatus)), fill = myColors, title = "Principal Components by Case Status")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$casestatus), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
palette(graphColors)
legend("bottom", levels(as.factor(pd.complete$Array)), fill = graphColors, title = "Principal Components by Array Position")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$Array), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.complete$Slide)), fill = graphColors, title = "Principal Components by Slide")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$Slide), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)
plot.new()
legend("bottom", levels(as.factor(pd.complete$gran.quart)), fill = graphColors, title = "Principal Components by Quartile of Gran")
pairs(prin$x[, 1:6], col = as.factor(pd.complete$gran.quart), labels = c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6"), pch = 1, cex = 0.5)